## ============================================================================
## 05_si_s2_baseline.R
## ----------------------------------------------------------------------------
## Purpose:   Supplementary baseline analyses: weighted baseline means,
##            partisan sub-group comparisons, recall confidence distribution,
##            predictors of belief accuracy, and Australian opinion
##            poll benchmarks. Corresponds to SI Section S2.1.
##
## Inputs:    combined_data.rds (via 00_setup.R),
##            aus_target.rds, usa_target.rds, aes_2022.rds, anes_2024.rds, 
##            ces_2024.rds, au_national_polls.rds
## Outputs:   Figures/fig_s10.pdf,
##            Figures/fig_s11.pdf,
##            Tables/table_s17.tex,
##            Tables/table_s18.tex,
##            Tables/table_s19.tex,
##            Tables/table_s20.tex,
##            Tables/table_s21.tex,
##            Tables/table_s22.tex
## ============================================================================

source("00_setup.R")

## ---- Setup and data prep --------------------------------------------

## Load benchmark survey data 
aus_target <- read_rds("aus_target.rds")
usa_target <- read_rds("usa_target.rds")
aes_dat    <- read_rds("aes_2022.rds")
anes_dat   <- read_rds("anes_2024.rds")
ces_dat    <- read_rds("ces_2024.rds")

## Re-label party groups to match population estimates
combined_dat <-                                                                                                           
  combined_dat %>%                                                                                                      
  mutate(
    x_party = case_when(
      sample == "AU" & x_pid_short == "Liberal/National"            ~ "Coalition",
      sample == "AU" & x_pid_short %in% c("No party", "Other party") ~ "Other",
      .default = x_pid_short
    )
  )

## Subset of untreated respondents
subset_dat <-
  combined_dat %>%
  filter(Z_land_info == 0) %>%
  mutate(
    sample = factor(sample, levels = c("AU", "US"), labels = c("Australia", "United States"))
  )

## Specify outcome variables 
outcome_vars <- c("y_substantive", "y_recall_binary", "y_acknowledge_binary",
                  "y_infoseek_binary", "y_symbolic", "y_guilt", "y_cbr")

## Calculate group means
sample_means <- map_dfr(outcome_vars, function(y_var) {
  subset_dat %>%
    group_by(sample) %>%
    group_modify(~ tidy(lm_robust(reformulate("1", y_var), data = .x))) %>%
    ungroup() %>%
    filter(term == "(Intercept)") %>%
    mutate(outcome = y_var)
}) %>%
  mutate(outcome = factor(
    outcome,
    levels = c("y_recall_binary", "y_acknowledge_binary", "y_infoseek_binary",
               "y_guilt", "y_cbr", "y_symbolic", "y_substantive"),
    labels = c(
      "Indigenous Awareness",
      "Indigenous Land",
      "Information Seeking",
      "Collective Guilt",
      "Colorblind Racism",
      "Symbolic Reparations",
      "Substantive Reparations"
    )
  ))

## Partisanship weights for AU sample using first-preferences from the 2024 
## election results
aus_pid = c(
  "Labor" = 0.35,
  "Coalition" = 0.32,
  "Greens" = 0.12,
  "Other" = 0.21
)

aus_covs <- c("x_gender_f", "x_age_cat", "x_gcc", "x_edu_college", "x_native",
              "x_employ_binary", "x_hhi", "x_party")

aus_target <- list_assign(aus_target, x_party = aus_pid)

aus_weighted <-
  combined_dat %>%
  filter(sample == "AU") %>%
  harvest(., target = aus_target[aus_covs])

## Partisanship weights for US sample using ANES and CES proportions
ces_pid_7 <- c(
  "Strong Democrat"        = 0.23, "Democrat"               = 0.11,
  "Independent Democrat"   = 0.10, "Independent"            = 0.16,
  "Independent Republican" = 0.10, "Republican"             = 0.10,
  "Strong Republican"      = 0.20
)

anes_pid_7 <- c(
  "Strong Democrat"        = 0.23, "Democrat"               = 0.12,
  "Independent Democrat"   = 0.13, "Independent"            = 0.07,
  "Independent Republican" = 0.13, "Republican"             = 0.11,
  "Strong Republican"      = 0.21
)

ces_target <- list_assign(usa_target, x_pid_7 = ces_pid_7)
anes_target <- list_assign(usa_target, x_pid_7 = anes_pid_7)

usa_covs <- c("x_gender_f", "x_age_cat", "x_ethnicity", "x_region", "x_edu_college",
              "x_native", "x_employ_binary", "x_hhi", "x_pid_7")

## Add weights using CES partisanship
usa_weighted <-
  combined_dat %>%
  filter(sample == "US") %>%
  harvest(., target = ces_target[usa_covs], weight_column = "ces_weights")

## Add weights using ANES partisanship
usa_weighted$anes_weights <-
  combined_dat %>%
  filter(sample == "US") %>%
  harvest(., target = anes_target[usa_covs], weight_column = "anes_weights") %>%
  pull(anes_weights)

## Combine all versions for comparisons across weighting approaches
## Weighted means by sample and estimator
outcome_levs <- c(
  "y_recall_binary",
  "y_acknowledge_binary",
  "y_infoseek_binary",
  "y_guilt",
  "y_cbr",
  "y_symbolic",
  "y_substantive"
)

outcome_labs <- c(
  "Indigenous Awareness",
  "Indigenous Land",
  "Information Seeking",
  "Collective Guilt",
  "Colorblind Racism",
  "Symbolic Reparations",
  "Substantive Reparations"
)

## Outcome indices
outcome_indices <- c(
  "Collective Guilt",
  "Colorblind Racism",
  "Symbolic Reparations",
  "Substantive Reparations"
)

aus_subset <- aus_weighted %>% filter(Z_land_info == 0, sample == "AU")
usa_subset <- usa_weighted %>% filter(Z_land_info == 0, sample == "US")

aus_means_w <- map_dfr(outcome_levs, function(y_var) {
  tidy(lm_robust(reformulate("1", y_var), data = aus_subset,
                 weights = aus_subset$weights)) %>%
    mutate(outcome = y_var)
}) %>%
  mutate(outcome = factor(outcome, levels = outcome_levs, labels = outcome_labs),
         sample = "Australia", estimator = "Weighted")

usa_means_ces <- map_dfr(outcome_levs, function(y_var) {
  tidy(lm_robust(reformulate("1", y_var), data = usa_subset,
                 weights = usa_subset$ces_weights)) %>%
    mutate(outcome = y_var)
}) %>%
  mutate(outcome = factor(outcome, levels = outcome_levs, labels = outcome_labs),
         sample = "United States", estimator = "CES weighted")

usa_means_anes <- map_dfr(outcome_levs, function(y_var) {
  tidy(lm_robust(reformulate("1", y_var), data = usa_subset,
                 weights = usa_subset$anes_weights)) %>%
    mutate(outcome = y_var)
}) %>%
  mutate(outcome = factor(outcome, levels = outcome_levs, labels = outcome_labs),
         sample = "United States", estimator = "ANES weighted")

all_means <-
  sample_means %>%
  mutate(estimator = "Unweighted") %>%
  bind_rows(aus_means_w, usa_means_ces, usa_means_anes)

## ---- In text: weighting diagnostics by country and approach ------
aus_weighted %>%
  diagnose_weights(target = aus_target[aus_covs]) %>%
  summarise(error_original = mean(error_original),
            error_weighted = mean(error_weighted)) 

design_effect(aus_weighted$weights)
effective_sample_size(aus_weighted$weights)

usa_weighted %>%
  diagnose_weights(target = ces_target[usa_covs], weights = usa_weighted$ces_weights) %>%
  summarise(
    error_original = mean(error_original),
    error_weighted = mean(error_weighted)
  ) 

design_effect(usa_weighted$ces_weights)
effective_sample_size(usa_weighted$ces_weights)

usa_weighted %>%
  diagnose_weights(target = anes_target[usa_covs], weights = usa_weighted$anes_weights) %>%
  summarise(
    error_original = mean(error_original),
    error_weighted = mean(error_weighted)
  ) 

design_effect(usa_weighted$anes_weights)
effective_sample_size(usa_weighted$anes_weights)

## ---- Table S17: Unweighted baseline means and between-country differences ----
est_diff <-
  sample_means %>%
  select(sample, outcome, estimate, std.error, p.value) %>%
  pivot_wider(names_from = sample, values_from = c(estimate, std.error, p.value)) %>%
  mutate(
    diff_estimate = `estimate_Australia` - `estimate_United States`,
    diff_std_error = sqrt(`std.error_Australia`^2 + `std.error_United States`^2),
    diff_p_value = 2 * pnorm(abs(diff_estimate) / diff_std_error, lower.tail = FALSE),
    diff_ci_lwr = diff_estimate - 1.96*diff_std_error,
    diff_ci_upr = diff_estimate + 1.96*diff_std_error
  )

sample_means %>%
  filter(outcome %in% outcome_indices) %>%
  mutate(entry = table_entry(est = estimate, se = std.error)) %>%
  select(sample, outcome, entry) %>%
  pivot_wider(names_from = sample, values_from = entry) %>%
  left_join(
    est_diff %>%
      filter(outcome %in% outcome_indices) %>%
      mutate(
        diff    = table_entry(est = diff_estimate, se = diff_std_error),
        diff_ci = paste0("[", round(diff_ci_lwr, 2), ", ", round(diff_ci_upr, 2), "]"),
        diff_p  = case_when(
          diff_p_value < 0.001 ~ "< 0.001",
          .default = paste(round(diff_p_value, 2))
        )
      ) %>%
      select(outcome, diff, diff_ci, diff_p),
    by = "outcome"
  ) %>%
  mutate(outcome = factor(outcome, levels = outcome_indices)) %>%
  arrange(outcome) %>%
  xtable() %>%
  print(include.rownames = FALSE, include.colnames = FALSE, only.contents = TRUE,
        type = "latex", sanitize.text.function = identity, comment = FALSE,
        hline.after = NULL, file = paste0(tab_dir, "table_s17.tex"))


## ---- Figure S10: Weighted and unweighted baseline means for attitude indices ----                                      
shape_map <- c(                                                                                                           
  "Unweighted"   = 22,                                                                                                    
  "Weighted"     = 21,                                                                                                    
  "CES weighted" = 21,                                    
  "ANES weighted" = 23
)

# Position adjustment for dodge
p_adjust <- 0.9

p <-
  all_means %>%
  mutate(estimator = factor(
    estimator,
    levels = c("Unweighted", "Weighted", "CES weighted", "ANES weighted")
  )) %>%
  filter(outcome %in% c(
    "Collective Guilt", "Colorblind Racism",
    "Symbolic Reparations", "Substantive Reparations"
  )) %>%
  ggplot(aes(
    x = estimate, y = outcome,
    color = sample, fill = sample,
    shape = estimator,
    group = interaction(estimator, sample)
  )) +
  geom_vline(xintercept = 4, color = "black", linetype = "solid", linewidth = 0.2) +
  geom_errorbar(
    aes(xmin = conf.low, xmax = conf.high),
    width = 0,
    position = position_dodgev(height = p_adjust),
    linewidth = 1.5,
    orientation = "y"
  ) +
  geom_point(
    position = position_dodgev(height = p_adjust),
    size = 4,
    color = "white"
  ) +
  facet_col(~ outcome, space = "free", scales = "free_y") +
  scale_shape_manual("", values = shape_map) +
  scale_x_continuous(limits = c(3.4, 5.1), breaks = seq(3.5, 5, by = 0.5)) +
  scale_color_manual("", values = c("black", "darkgrey")) +
  scale_fill_manual("", values = c("black", "darkgrey")) +
  theme_tufte(base_size = 14) +
  labs(x = "Average score (1 - 7 scale)", y = "") +
  theme(
    axis.ticks.y      = element_blank(),
    axis.ticks.x      = element_line(linewidth = 0.25),
    axis.line.x       = element_line(linewidth = 0.25, color = "black"),
    axis.title.x      = element_text(size = 14),
    axis.text.x       = element_text(size = 12, color = "black"),
    axis.text.y       = element_blank(),
    strip.text        = element_text(size = 14, hjust = 0.5),
    plot.margin       = unit(c(t = 0, r = 0, b = 0, l = -1), "lines"),
    legend.position   = "none",
    legend.margin     = margin(0, 0, 0, 0),
    legend.box.spacing = unit(0, "pt"),
    legend.box.margin = margin(0, 0, 0, 0)
  )

g <-
  p +
  geom_label(
    data = p$data %>%
      filter(outcome == "Collective Guilt") %>%
      mutate(
        estimate = 4.6,
        estimator_label = case_when(
          estimator == "Unweighted"   & sample == "Australia"     ~ "Australia - Unweighted",
          estimator == "Unweighted"   & sample == "United States" ~ "U.S. - Unweighted",
          estimator == "Weighted"     & sample == "Australia"     ~ "Australia - Weighted",
          estimator == "CES weighted" & sample == "United States" ~ "U.S. - CES weighted",
          estimator == "ANES weighted"& sample == "United States" ~ "U.S. - ANES weighted"
        )
      ),
    aes(label = estimator_label),
    position = position_dodgev(height = 1.05),
    hjust = 0, color = "white", size = 3.25, show.legend = FALSE
  )

g

ggsave(
  filename = paste0(fig_dir, "fig_s10.pdf"),
  g,
  width = 5,
  height = 6
)

## ---- Table S18: Weighted baseline means and between-country differences ----
w_diff_tab <- function(us_means) {
  aus_means_w %>%
    select(outcome, est_au = estimate, se_au = std.error) %>%
    left_join(
      us_means %>% select(outcome, est_us = estimate, se_us = std.error),
      by = "outcome"
    ) %>%
    mutate(
      d    = est_au - est_us,
      d_se = sqrt(se_au^2 + se_us^2),
      diff = table_entry(est = d, se = d_se),
      ci   = paste0("[", round(d - 1.96*d_se, 2), ", ", round(d + 1.96*d_se, 2), "]"),
      p    = case_when(
        2*pnorm(abs(d)/d_se, lower.tail = FALSE) < 0.001 ~ "< 0.001",
        .default = paste(round(2*pnorm(abs(d)/d_se, lower.tail = FALSE), 3))
      )
    ) %>%
    select(outcome, diff, ci, p)
}

aus_means_w %>%
  mutate(aus_entry = table_entry(est = estimate, se = std.error)) %>%
  select(outcome, aus_entry) %>%
  left_join(
    usa_means_ces %>%
      mutate(ces_entry = table_entry(est = estimate, se = std.error)) %>%
      select(outcome, ces_entry),
    by = "outcome"
  ) %>%
  left_join(
    usa_means_anes %>%
      mutate(anes_entry = table_entry(est = estimate, se = std.error)) %>%
      select(outcome, anes_entry),
    by = "outcome"
  ) %>%
  left_join(
    w_diff_tab(usa_means_ces) %>% rename(diff_ces = diff, ci_ces = ci, p_ces = p),
    by = "outcome"
  ) %>%
  left_join(
    w_diff_tab(usa_means_anes) %>% rename(diff_anes = diff, ci_anes = ci, p_anes = p),
    by = "outcome"
  ) %>%
  xtable() %>%
  print(include.rownames = FALSE, include.colnames = FALSE, only.contents = TRUE,
        type = "latex", sanitize.text.function = identity, comment = FALSE,
        hline.after = NULL, file = paste0(tab_dir, "table_s18.tex"))

## ---- Tables S19-S20: Baseline differences among partisan groups by sample ----

## Calculate group means and differences by PID
dim_pid <- map_dfr(outcome_levs, function(y_var) {
  subset_dat %>%
    group_by(x_pid_comb2, sample) %>%
    group_modify( ~ tidy(lm_robust(reformulate("1", y_var), data = .x))) %>%
    ungroup() %>%
    filter(term == "(Intercept)") %>%
    mutate(outcome = y_var)
}) %>%
  mutate(outcome = factor(outcome, levels = outcome_levs, labels = outcome_labs))

pid_diff <-
  dim_pid %>%
  select(x_pid_comb2, sample, outcome, estimate, std.error, p.value) %>%
  pivot_wider(names_from = sample, values_from = c(estimate, std.error, p.value)) %>%
  mutate(
    diff_estimate = `estimate_Australia` - `estimate_United States`,
    diff_std_error = sqrt(`std.error_Australia`^2 + `std.error_United States`^2),
    diff_ci_lwr = diff_estimate - 1.96 * diff_std_error,
    diff_ci_upr = diff_estimate + 1.96 * diff_std_error,
    diff_p_value = 2 * pnorm(abs(diff_estimate) / diff_std_error, lower.tail = FALSE)
  ) %>%
  select(x_pid_comb2, outcome, diff_estimate, diff_std_error, diff_ci_lwr,
         diff_ci_upr, diff_p_value)

dim_pid_tab <-
  dim_pid %>%
  mutate(entry = table_entry(est = estimate, se = std.error)) %>%
  select(x_pid_comb2, sample, outcome, entry) %>%
  pivot_wider(names_from = sample, values_from = entry) %>%
  left_join(
    pid_diff %>%
      mutate(
        diff = table_entry(est = diff_estimate, se = diff_std_error),
        diff_ci = paste0("[", round(diff_ci_lwr, 2), ", ", round(diff_ci_upr, 2), "]"),
        diff_p = case_when(
          diff_p_value < 0.001 ~ "< 0.001",
          round(diff_p_value, 3) == 0.001 ~ "0.001",
          .default = paste(round(diff_p_value, 3))
        )
      ) %>%
      select(x_pid_comb2, outcome, diff, diff_ci, diff_p),
    by = c("x_pid_comb2", "outcome")
  ) %>%
  arrange(outcome) %>%
  mutate(x_pid_comb2 = factor(x_pid_comb2, levels = c("Left", "Independent", "Right"))) %>%
  rename(x_pid = x_pid_comb2)

## Table S19: Binary outcomes by partisan group 
dim_pid_tab %>%
  filter(!(
    outcome %in% outcome_indices
  )) %>%
  arrange(outcome, x_pid) %>%
  xtable() %>%
  print(
    include.rownames = FALSE,
    include.colnames = FALSE,
    only.contents = TRUE,
    type = "latex",
    sanitize.text.function = identity,
    comment = FALSE,
    hline.after = NULL,
    file = paste0(tab_dir, "table_s19.tex")
  )

## Table S20: Attitude indices by partisan group 
dim_pid_tab %>%
  filter(outcome %in% outcome_indices) %>%
  arrange(outcome, x_pid) %>%
  xtable() %>%
  print(
    include.rownames = FALSE,
    include.colnames = FALSE,
    only.contents = TRUE,
    type = "latex",
    sanitize.text.function = identity,
    comment = FALSE,
    hline.after = NULL,
    file = paste0(tab_dir, "table_s20.tex")
  )

## ---- Figure S11: Distribution of Indigenous Awareness responses by confidence level ----
plot_dat <-                                                                                                          
  combined_dat %>%
  filter(Z_land_info == 0, !is.na(y_recall_conf)) %>%
  mutate(
    y_recall_conf = str_replace(y_recall_conf, " confident", ""),
    y_recall_conf = factor(
      y_recall_conf,
      levels = c("Not at all", "Slightly", "Moderately", "Very", "Extremely"),
      labels = c("Not at all\nconfident", "Slightly\nconfident",
                 "Moderately\nconfident", "Very\nconfident",
                 "Extremely\nconfident")
    ),
    guess = factor(ifelse(y_recall_binary == 1, "Correct", "Incorrect"),
                   levels = c("Correct", "Incorrect"))
  )

## Generate plot showing all data points per journal requirements
plot_sum <-
  plot_dat %>%
  group_by(y_recall_conf) %>%
  summarise(
    n   = n(),
    est = mean(y_recall_binary, na.rm = TRUE),
    se  = sqrt(est * (1 - est) / n),
    lo  = est - 1.96 * se,
    hi  = est + 1.96 * se
  )

set.seed(123)                                                                                                             
plot_dat <- 
  plot_dat %>%                           
  mutate(
    .r     = sqrt(runif(n())) * 0.1,   # radius in y-axis units; tune this
    .theta = runif(n(), 0, 2 * pi),
    # aspect_fac corrects for the different visual scales of x and y axes
    # increase if blobs look too tall, decrease if too wide
    .x_jit = .r * cos(.theta) * 2.5,
    .y_jit = .r * sin(.theta)
  )

p <-
  ggplot(plot_dat,
         aes(x = as.numeric(y_recall_conf) + .x_jit,
             y = y_recall_binary + .y_jit,
             color = guess)) +
  geom_point(alpha = 0.5, size = 0.9) +
  geom_errorbar(
    data = plot_sum,
    aes(x = as.numeric(y_recall_conf), y = est, ymin = lo, ymax = hi, color = NULL),
    width = 0, linewidth = 0.8, color = "black"
  ) +
  geom_point(
    data = plot_sum,
    aes(x = as.numeric(y_recall_conf), y = est, color = NULL),
    size = 3.5, shape = 22, fill = "black", color = "white"
  ) +
  geom_text(
    data = plot_sum,
    aes(x = as.numeric(y_recall_conf) + 0.2, y = est, color = NULL,
        label = paste0(sprintf("%.0f", est*100), "%")),
    size = 3.5, color = "black"
  ) +
  scale_x_continuous(
    breaks = 1:5,
    labels = levels(plot_dat$y_recall_conf)
  ) +
  scale_color_manual("", values = c("Correct" = "darkgrey", "Incorrect" = "red")) +
  scale_y_continuous(breaks = seq(0, 1, 0.25)) +
  theme_tufte(base_size = 11) +
  theme(
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_line(linewidth = 0.25),
    axis.line.x = element_line(linewidth = 0.25),
    axis.title.x = element_text(size = 10),
    axis.text.x = element_text(size = 11, margin = margin(5,0,0,0)),
    axis.text.y = element_blank(),
    plot.margin = unit(c(t = 0, r = .8, b = .2, l = .2), "lines"),
    legend.margin = margin(t = 0, r = -5, b = 0, l = -10),
    legend.box.spacing = unit(5, "pt"),
    legend.box.margin = margin(0, 0, 0, 0),
    legend.text = element_text(size = 10)
  ) +
  labs(x = NULL, y = NULL) +
  guides(color = guide_legend(override.aes = list(alpha = 1, size = 2)))

p

g <-
  p +
  coord_capped_cart(
    bottom = brackets_horizontal(tick.length = unit(.2, 'cm'), direction = "up"),
    gap = 0
  )

g

ggsave(
  filename = paste0(fig_dir, "fig_s11.pdf"),
  g,
  width = 6.5,
  height = 4
)

## ---- Table S21: Predictors of Indigenous Awareness by sample ----

## Fit for Australia
au_fit <- glm(
  y_recall_binary ~ x_gender_f + x_age_cat + x_state_comb + x_edu_college +
    x_native + x_employ_cat + x_hhi + x_pid_comb2 +
    x_ideo_cat,
  family = quasibinomial(link = "logit"),
  data = combined_dat %>%
    mutate(
      x_pid_comb2 = factor(x_pid_comb2, levels = c("Independent", "Left", "Right")),
      x_ideo_cat = factor(x_ideo_cat, levels = c("Moderate", "Left", "Right"))
    ) %>%
    filter(sample == "AU", Z_land_info == 0)
)

mfx_au <-
  margins::margins(au_fit, vce = "delta") %>%
  broom::tidy()

x_levels_au <- c(
  mfx_au$term[str_detect(mfx_au$term, "x_gender")],
  mfx_au$term[str_detect(mfx_au$term, "x_age")],
  mfx_au$term[str_detect(mfx_au$term, "x_state")],
  mfx_au$term[str_detect(mfx_au$term, "x_edu")],
  mfx_au$term[str_detect(mfx_au$term, "x_native")],
  "x_employ_catUnemployed", "x_employ_catOther",
  "x_hhi$30,000 - $49,999",   "x_hhi$50,000 - $79,999",
  "x_hhi$80,000 - $99,999", "x_hhi$100,000 - $149,999",
  "x_hhi$150,000 - $199,999", "x_hhi$200,000 +",
  mfx_au$term[str_detect(mfx_au$term, "x_pid")],
  mfx_au$term[str_detect(mfx_au$term, "x_ideo")]
)

x_labels_au <- c(
  "Female",
  "25-34", "35-44", "45-54", "55-64", "65+",
  "Queensland",
  "South Australia/Northern Territory",
  "Victoria/Tasmania",
  "Western Australia",
  "Bachelors degree or higher",
  "Native born",
  "Unemployed", "Other",
  "$30,000-$49,000", "$50,000-$79,999", "$80,000-$99,999",
  "$100,000-$149,999", "$150,000-$199,999", "$200,000 or more",
  "Left partisan", "Right partisan",
  "Left ideology", "Right ideology"
)

est_au <-
  mfx_au %>%
  mutate(
    x_group = case_when(
      str_detect(term, "x_gender") ~ "Gender:",
      str_detect(term, "x_age") ~ "Age:",
      str_detect(term, "x_state") ~ "State/Territory:",
      str_detect(term, "x_edu") ~ "Education:",
      str_detect(term, "x_native") ~ "Birthplace:",
      str_detect(term, "x_employ") ~ "Employment status:",
      str_detect(term, "x_hhi") ~ "Income:",
      str_detect(term, "x_pid") ~ "Partisanship:",
      str_detect(term, "x_ideo") ~ "Ideology:"
    ),
    x_group = factor(
      x_group,
      levels = c("Gender:", "Age:", "State/Territory:", "Education:",
                 "Birthplace:", "Employment status:", "Income:",
                 "Partisanship:", "Ideology:")
    ),
    x_cat = factor(term, levels = x_levels_au, labels = x_labels_au),
    sample = "Australia"
  ) %>%
  select(term, x_group, x_cat, everything()) %>%
  arrange(x_cat)

## Fit for USA
us_fit <- glm(
  y_recall_binary ~ x_gender_f + x_age_cat + x_ethnicity + x_region +
    x_edu_college + x_native + x_employ_cat + x_hhi + x_pid_comb2,
  family = quasibinomial(link = "logit"),
  data = combined_dat %>%
    mutate(
      x_pid_comb2 = factor(x_pid_comb2, levels = c("Independent", "Left", "Right")),
      x_ethnicity = factor(x_ethnicity, levels = c("White", "Latino", "Black", "Asian"))
    ) %>%
    filter(sample == "US", Z_land_info == 0)
)

mfx_us <-
  margins::margins(us_fit, vce = "delta") %>%
  broom::tidy()

x_levels_us <- c(
  mfx_us$term[str_detect(mfx_us$term, "x_gender")],
  mfx_us$term[str_detect(mfx_us$term, "x_age")],
  rev(mfx_us$term[str_detect(mfx_us$term, "x_ethnicity")]),
  mfx_us$term[str_detect(mfx_us$term, "x_region")],
  mfx_us$term[str_detect(mfx_us$term, "x_edu")],
  mfx_us$term[str_detect(mfx_us$term, "x_native")],
  "x_employ_catUnemployed", "x_employ_catOther",
  "x_hhi$30,000 - $49,999",   "x_hhi$50,000 - $79,999",
  "x_hhi$80,000 - $99,999", "x_hhi$100,000 - $149,999",
  "x_hhi$150,000 - $199,999", "x_hhi$200,000 +",
  mfx_us$term[str_detect(mfx_us$term, "x_pid")]
)

x_labels_us <- c(
  "Female",
  "25-34", "35-44", "45-54", "55-64", "65+",
  "Latino", "Black", "Asian",
  "Northeast", "South", "West",
  "Bachelors degree or higher",
  "Native born",
  "Unemployed", "Other",
  "$30,000-$49,000", "$50,000-$79,999", "$80,000-$99,999",
  "$100,000-$149,999", "$150,000-$199,999", "$200,000 or more",
  "Left partisan", "Right partisan"
)

est_us <-
  mfx_us %>%
  mutate(
    x_group = case_when(
      str_detect(term, "x_gender") ~ "Gender:",
      str_detect(term, "x_age") ~ "Age:",
      str_detect(term, "x_ethnicity") ~ "Race/Ethnicity:",
      str_detect(term, "x_region") ~ "Region:",
      str_detect(term, "x_edu") ~ "Education:",
      str_detect(term, "x_native") ~ "Birthplace:",
      str_detect(term, "x_employ") ~ "Employment status:",
      str_detect(term, "x_hhi") ~ "Income:",
      str_detect(term, "x_pid") ~ "Partisanship:"
    ),
    x_group = factor(
      x_group,
      levels = c("Gender:", "Age:", "Race/Ethnicity:", "Region:", "Education:",
                 "Birthplace:", "Employment status:", "Income:", "Partisanship:",
                 "Ideology:")
    ),
    x_cat = factor(term, levels = x_levels_us, labels = x_labels_us),
    sample = "United States"
  ) %>%
  select(term, x_group, x_cat, everything()) %>%
  arrange(x_cat)

## Combine and export as table for SI
est_combined <-
  bind_rows(est_au, est_us) %>%
  mutate(
    entry = table_entry(est = estimate, se = std.error),
    pval  = case_when(
      p.value < 0.001 ~ "< 0.001",
      .default        = sprintf("%.3f", p.value)
    )
  ) %>%
  select(sample, x_group, x_cat, entry, pval) %>%
  pivot_wider(names_from = sample, values_from = c(entry, pval),
              values_fill = "") %>%
  select(x_group, x_cat, entry_Australia, pval_Australia,
         `entry_United States`, `pval_United States`)

est_combined %>%
  xtable() %>%
  print(include.rownames = FALSE, include.colnames = FALSE, only.contents = TRUE,
        type = "latex", sanitize.text.function = identity, comment = FALSE,
        hline.after = NULL, file = paste0(tab_dir, "table_s21.tex"))

## ---- Table S22: Australian sample vs. national polls on reconciliation issues ----
au_polls <- read_rds("au_national_polls.rds")
au_ctrl  <- combined_dat %>% filter(sample == "AU", Z_land_info == 0)

# Helper: run lm_robust and return tidy estimates
lm_est <- function(formula) {
  tidy(lm_robust(formula, data = au_ctrl)) %>%
    filter(term == "(Intercept)") %>%
    select(estimate, se = std.error, ci_lo = conf.low, ci_hi = conf.high)
}

## National poll estimates: CI = estimate +/- margin of error; 
## SE = error_margin / 1.96
polls_fmt <-
  au_polls %>%
  mutate(
    se          = error_margin / 1.96,
    ci_lo       = estimate - error_margin,
    ci_hi       = estimate + error_margin,
    source_date = paste0(source, " (", date, ")")
  ) %>%
  select(question, estimate, se, ci_lo, ci_hi, source_date)

# Current sample estimates matched to questions by keyword
sample_fmt <-
  au_polls %>%
  distinct(question) %>%
  mutate(
    formula_str = case_when(
      str_detect(question, regex("truth",     ignore_case = TRUE)) ~ "I(y_substantive_q2_n > 4) ~ 1",
      str_detect(question, regex("australia", ignore_case = TRUE)) ~ "I(y_symbolic_q2_n > 4) ~ 1",
      str_detect(question, regex("sport",     ignore_case = TRUE)) ~ "I(y_symbolic_q3_n > 4) ~ 1",
      str_detect(question, regex("naming",    ignore_case = TRUE)) ~ "I(y_symbolic_q4_n > 4) ~ 1"
    )
  ) %>%
  mutate(result = map(formula_str, ~ lm_est(as.formula(.x)))) %>%
  unnest(result) %>%
  mutate(source_date = "Current sample (October 2023)") %>%
  select(question, estimate, se, ci_lo, ci_hi, source_date)

# Combine: current sample first within each question group
bind_rows(sample_fmt, polls_fmt) %>%
  arrange(question, source_date != "Current sample (October 2023)") %>%
  mutate(
    entry = paste0(round(estimate, 2), " (", round(se, 2), ")"),
    ci    = paste0("[", round(ci_lo, 2), ", ", round(ci_hi, 2), "]")
  ) %>%
  select(source_date, entry, ci) %>%
  xtable() %>%
  print(include.rownames = FALSE, include.colnames = FALSE, only.contents = TRUE,
        type = "latex", sanitize.text.function = identity, comment = FALSE,
        hline.after = NULL, file = paste0(tab_dir, "table_s22.tex"))

## ---- Session info -----------------------------------------------------------
sessionInfo()
